Data Inspection¶

In [1]:
# Enable plotly's offline mode so figures render inside the notebook
# without a plotly cloud account.
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm

# Stream the CSV in 100k-row chunks (tqdm shows progress) and stitch the
# chunks back together into a single frame.
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv",
    chunksize=100000,
    dtype={
        "CANCELLATION_CODE": str,  # letter codes mixed with NaN: keep as strings
    },
)
chunks = [chunk for chunk in tqdm(chunk_iter)]
df = pd.concat(chunks)
df.head()
130it [00:29,  4.44it/s]
Out[2]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... TAXI_IN ARR_TIME ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 28.0 1214.0 9.0 0.0 NaN NaN NaN NaN NaN NaN
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 13.0 945.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... 6.0 1341.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 7.0 1159.0 4.0 0.0 NaN NaN NaN NaN NaN NaN
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 6.0 2317.0 2.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 24 columns

Data Cleaning¶

In [4]:
import pandas as pd

# Rows without a recorded departure/arrival time (presumably cancelled
# flights — verify) cannot be converted to timestamps below, so drop them.
df.dropna(subset = ['DEP_TIME', 'ARR_TIME'], inplace = True)

def convert_to_timestamp(date, start_time, end_time, return_time = "start"):
    """Combine a flight date with an HHMM-encoded clock time into a Timestamp.

    Parameters
    ----------
    date : pd.Timestamp
        The flight date (its time component is ignored).
    start_time, end_time : float
        Departure/arrival times encoded as HHMM (e.g. 1425.0 == 14:25).
        2400 encodes midnight of the following day.
    return_time : str
        "start" builds the departure timestamp; anything else builds the
        arrival timestamp (rolled to the next day for overnight flights).

    Returns
    -------
    pd.Timestamp or None
        None when the inputs cannot be parsed (the raw values are printed),
        e.g. NaN times that slipped past upstream filtering.
    """
    try:
        # Parse inside the try so NaN/garbled values reach the fallback below.
        # (Originally parsing happened BEFORE the try, so such rows raised
        # instead of being logged and skipped.)
        start_time_str = str(int(start_time)).zfill(4)
        start_hour = int(start_time_str[:2])
        start_minute = int(start_time_str[2:])

        end_time_str = str(int(end_time)).zfill(4)
        end_hour = int(end_time_str[:2])
        end_minute = int(end_time_str[2:])

        if return_time == "start":
            # 2400 means midnight: roll over to the next calendar day.
            if start_hour == 24:
                date = date + pd.DateOffset(days=1)
                start_hour = start_hour % 24
            return pd.Timestamp(
                year=date.year,
                month=date.month,
                day=date.day,
                hour=start_hour,
                minute=start_minute
            )
        # Arrival HHMM smaller than departure HHMM implies an overnight flight.
        if end_time < start_time or end_hour == 24:
            date = date + pd.DateOffset(days=1)
            end_hour = end_hour % 24
        return pd.Timestamp(
            year=date.year,
            month=date.month,
            day=date.day,
            hour=end_hour,
            minute=end_minute
        )
    except (TypeError, ValueError):
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt). Keep the original best-effort behaviour:
        # log the raw values and skip the row.
        print(start_time, end_time)
        return None

# Build real departure/arrival timestamps from the flight date plus the
# HHMM-encoded DEP_TIME/ARR_TIME columns.
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df['Start_Timestamp'] = df.apply(lambda row: convert_to_timestamp(row['FL_DATE'], row['DEP_TIME'], row['ARR_TIME'], "start"), axis=1)
df['End_Timestamp'] = df.apply(lambda row: convert_to_timestamp(row['FL_DATE'], row['DEP_TIME'], row['ARR_TIME'], "end"), axis=1)

df.head()
Out[4]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Start_Timestamp End_Timestamp
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 9.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 10:25:00 2014-07-01 12:14:00
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 0.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 06:21:00 2014-07-01 09:45:00
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... -9.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 12:38:00 2014-07-01 13:41:00
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 4.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 09:04:00 2014-07-01 11:59:00
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 2.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 20:16:00 2014-07-01 23:17:00

5 rows × 26 columns

In [5]:
# Inspect the column set (now includes the two derived timestamp columns).
df.columns
Out[5]:
Index(['FL_DATE', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN_AIRPORT_SEQ_ID',
       'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
       'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
       'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY',
       'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'Start_Timestamp',
       'End_Timestamp'],
      dtype='object')
In [6]:
import matplotlib.pyplot as plt
from datetime import time

# Columns that the delay analysis below does not need — IDs and raw time
# fields already superseded by the derived timestamp columns.
_unused_columns = [
    "FL_DATE",
    "OP_UNIQUE_CARRIER",
    "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEP_TIME",
    "ARR_TIME",
    "TAXI_OUT",
    "TAXI_IN",
]
target_df = df.drop(columns=_unused_columns)
del df  # release the full frame's memory; only target_df is used from here on

Visualization¶

Route¶

In [7]:
# Vectorized string concatenation — the original row-wise apply produced the
# same column but is far slower on a frame of this size (~13M rows per the
# load cell's 130 x 100k chunks).
target_df['Route'] = target_df['ORIGIN_CITY_NAME'] + " -> " + target_df['DEST_CITY_NAME']
target_df.head()
Out[7]:
ORIGIN ORIGIN_CITY_NAME DEST DEST_CITY_NAME DEP_DELAY ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Start_Timestamp End_Timestamp Route
0 CLT Charlotte, NC DFW Dallas/Fort Worth, TX -5.0 9.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 10:25:00 2014-07-01 12:14:00 Charlotte, NC -> Dallas/Fort Worth, TX
1 DFW Dallas/Fort Worth, TX CLT Charlotte, NC -4.0 0.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 06:21:00 2014-07-01 09:45:00 Dallas/Fort Worth, TX -> Charlotte, NC
2 ATL Atlanta, GA DFW Dallas/Fort Worth, TX -2.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 12:38:00 2014-07-01 13:41:00 Atlanta, GA -> Dallas/Fort Worth, TX
3 DFW Dallas/Fort Worth, TX ATL Atlanta, GA 14.0 4.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 09:04:00 2014-07-01 11:59:00 Dallas/Fort Worth, TX -> Atlanta, GA
4 DFW Dallas/Fort Worth, TX ATL Atlanta, GA 1.0 2.0 0.0 NaN NaN NaN NaN NaN NaN 2014-07-01 20:16:00 2014-07-01 23:17:00 Dallas/Fort Worth, TX -> Atlanta, GA
In [33]:
# Number of days covered by the "10Y" dataset (~10 years incl. leap days).
# NOTE(review): confirm against the data's actual first/last FL_DATE.
N_DAYS = 3654

# Per-route median departure delay and usage counts.
# (Removed two superseded, commented-out versions of this aggregation.)
target_df_temp = target_df.groupby('Route')["DEP_DELAY"].agg(
    Median_Delay=lambda x: x.quantile(0.5),
    Route_Use="size",
).reset_index()
target_df_temp['Daily_Route_Use'] = target_df_temp.Route_Use / N_DAYS
target_df_temp
Out[33]:
Route Median_Delay Route_Use Daily_Route_Use
0 Atlanta, GA -> Charlotte, NC -2.0 56939 15.582649
1 Atlanta, GA -> Chicago, IL -1.0 63768 17.451560
2 Atlanta, GA -> Dallas, TX -1.0 31748 8.688560
3 Atlanta, GA -> Dallas/Fort Worth, TX -1.0 63436 17.360701
4 Atlanta, GA -> Denver, CO -1.0 55613 15.219759
... ... ... ... ...
499 St. Louis, MO -> San Francisco, CA 0.0 5637 1.542693
500 St. Louis, MO -> San Jose, CA 5.0 1399 0.382868
501 St. Louis, MO -> San Juan, PR 7.0 108 0.029557
502 St. Louis, MO -> Santa Ana, CA -1.0 724 0.198139
503 St. Louis, MO -> Seattle, WA -2.0 7261 1.987137

504 rows × 4 columns

In [37]:
import plotly.express as px

# One point per route: busy-and-delayed routes sit toward the top right.
fig = px.scatter(
    target_df_temp,
    x="Median_Delay",
    y="Daily_Route_Use",
    title="Median Delay vs Daily Route Use",
)
fig.show()
In [12]:
# Attach the per-route aggregate stats to every individual flight record.
# NOTE(review): the name is misleading — this keeps ALL matched routes, not a
# top-5 subset; kept as-is because later cells reference it.
top5_delayed_route = target_df.merge(
    target_df_temp,
    how="inner",
    on="Route",
)
In [18]:
import plotly.express as px
import numpy as np

np.random.seed(1)  # make the down-sample reproducible

# Down-sample to keep the box plot responsive. sample(replace=False) raises
# ValueError when the frame has fewer than 10k rows; in that case keep all
# rows. (Narrowed from a bare `except:` that would have hidden real errors.)
try:
    top5_delayed_route = top5_delayed_route.sample(10000, replace = False)
except ValueError:
    print("Too Few Rows")
fig = px.box(
    top5_delayed_route.sort_values("DEP_DELAY", ascending = False),
    x="Route",
    y="DEP_DELAY",
    log_y = True,
    # NOTE(review): no `color=` argument is passed, so this mapping has no
    # visible effect — confirm whether a color column was intended.
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=600,
    title_text="Route Departure Delay",  # was an f-string with no placeholders
    showlegend=True
)
fig.show()
Too Few Rows
for row in Each Row of Flight_Records:
    Baseline = row.Arrival_Time - row.Departure_Time + row.Arrival_Delay

    Optional_Flights = Filter(Flight_Records by Flight_Date +/- 1 Day and (Same Destination or Same Origin))

    Optional_Flights = Optional_Flights JOIN Optional_Flights ON Mid_Point

    Optional_Flights = Filter(Optional_Flights by Transit_Time >= 60 min)

    Optional_Flights.New_Time = Optional_Flights.Arrival_Time_2nd \
                            - Optional_Flights.Departure_Time_1st \
                            + Optional_Flights.Arrival_Delay_2nd

    Optional_Routes = Group Optional_Flights by Routes

    Possible_Routes = Filter(Optional_Routes by New_Time.Median < Baseline)

    return the Mean, Median, SD of Possible_Routes
In [52]:
from datetime import timedelta
from tqdm import tqdm
import numpy as np
import json

def improve(dep_time, arr_time, arr_delay, origin, dest, df):
    """Search for two-leg alternatives to a direct origin -> dest flight.

    Parameters
    ----------
    dep_time : pd.Timestamp
        Departure time of the original flight; candidate legs are restricted
        to departures within +/- 1 day of this time.
    arr_time : pd.Timestamp
        Arrival time of the original flight.
        NOTE(review): currently unused — kept for caller compatibility.
    arr_delay : float
        Arrival delay of the original flight; an alternative first leg is
        kept only when its median alternative delay beats this value.
    origin, dest : str
        City names of the original route's endpoints.
    df : pd.DataFrame
        Flight records with Start_Timestamp, End_Timestamp, ORIGIN_CITY_NAME,
        DEST_CITY_NAME, ARR_DELAY and Route columns.

    Returns
    -------
    pd.DataFrame or None
        Per-first-leg delay statistics (median/mean/std/max/min of the second
        leg's arrival delay) for qualifying alternatives, sorted by median;
        None when no alternative qualifies.
    """
    # Candidate flights: depart within +/- 1 day and touch either endpoint.
    temp_df = df[
        (df['Start_Timestamp'] >= (dep_time - timedelta(days = 1))) \
        & (df['Start_Timestamp'] <= (dep_time + timedelta(days = 1)))
    ]
    temp_df = temp_df[
        (temp_df.ORIGIN_CITY_NAME == origin) \
        | (temp_df.DEST_CITY_NAME == dest)
    ]

    # Self-join: first leg's destination city is the second leg's origin.
    route_suggest = pd.merge(
        temp_df, temp_df,
        how = "inner", left_on = "DEST_CITY_NAME", right_on = "ORIGIN_CITY_NAME",
        suffixes = ["_1st", "_2nd"]
    )
    route_suggest = route_suggest[
        (route_suggest['ORIGIN_CITY_NAME_1st'] == origin) \
        & (route_suggest['DEST_CITY_NAME_2nd'] == dest)
    ]
    if len(route_suggest) == 0: return None

    # Keep pairs leaving at least 1 hour of transit even after the first
    # leg's arrival delay is absorbed.
    transit_minutes = (
        route_suggest['Start_Timestamp_2nd'] - route_suggest['End_Timestamp_1st']
    ).apply(lambda x: x.total_seconds() // 60)
    route_suggest = route_suggest[transit_minutes > (route_suggest['ARR_DELAY_1st'] + 60)]
    if len(route_suggest) == 0: return None

    # Alternative-route delay = second leg's arrival delay. Direct column
    # assignment replaces the original needless row-wise apply (identical
    # values, no per-row Python overhead); the redundant length check that
    # followed it (an assignment cannot empty the frame) is removed.
    route_suggest['new_time'] = route_suggest['ARR_DELAY_2nd']

    # Keep only first legs whose median alternative delay beats the original.
    output = route_suggest.groupby(
        ['Route_1st']
    )['new_time'].agg(['median', 'mean', 'std', 'max', 'min']).reset_index()
    output.sort_values("median", inplace = True)
    output = output[output['median'] < arr_delay]
    if len(output) == 0: return None

    # Carry the original flight's identity for the later merge-back.
    output['Original_Departure_Time'] = dep_time
    output['Original_ORIGIN_CITY'] = origin
    output['Original_DEST_CITY'] = dest
    output['Original_Time_Taken'] = arr_delay
    return output

# Collect alternative-route suggestions for every sampled flight.
# total= gives tqdm a denominator so it can show percentage/ETA (the original
# bare iterrows() iterator has no length, so only a raw counter appeared).
routes_df_list = list()
for _, row in tqdm(top5_delayed_route.iterrows(), total=len(top5_delayed_route)):
    route = improve(
        row["Start_Timestamp"],
        row["End_Timestamp"],
        row["ARR_DELAY"],
        row["ORIGIN_CITY_NAME"],
        row["DEST_CITY_NAME"],
        target_df
    )
    if route is not None:
        routes_df_list.append(route)

pd.concat(routes_df_list)
7092it [12:18,  9.61it/s]
Out[52]:
Route_1st median mean std max min Original_Departure_Time Original_ORIGIN_CITY Original_DEST_CITY Original_Time_Taken
3 Houston, TX -> Denver, CO -14.0 11.777778 30.568684 44.0 -14.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
6 Houston, TX -> New Orleans, LA -12.0 -4.076923 18.895339 33.0 -16.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
1 Houston, TX -> Charlotte, NC -11.0 2.076923 20.076776 33.0 -17.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
10 Houston, TX -> Raleigh/Durham, NC -2.0 10.444444 43.090344 114.0 -24.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
0 Houston, TX -> Atlanta, GA 3.0 6.031250 20.286025 56.0 -14.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
... ... ... ... ... ... ... ... ... ... ...
0 Houston, TX -> Atlanta, GA -12.0 -10.020619 6.864734 3.0 -21.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
10 Houston, TX -> Raleigh/Durham, NC -10.0 -6.888889 12.910957 16.0 -22.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
6 Houston, TX -> Miami, FL -10.0 -5.818182 32.771384 79.0 -36.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
3 Houston, TX -> Denver, CO -9.0 -6.846154 14.006650 19.0 -23.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
12 Houston, TX -> Santa Ana, CA -8.0 -8.000000 NaN -8.0 -8.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0

44859 rows × 10 columns

In [53]:
alt_routes = pd.concat(routes_df_list)
# Attach each flight's alternative-route stats; flights for which improve()
# found nothing come through with NaNs (left join).
merged_temp_df = top5_delayed_route.merge(
    alt_routes,
    how="left",
    left_on=['Start_Timestamp', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME'],
    right_on=['Original_Departure_Time', 'Original_ORIGIN_CITY', 'Original_DEST_CITY'],
)
merged_temp_df
Out[53]:
ORIGIN ORIGIN_CITY_NAME DEST DEST_CITY_NAME DEP_DELAY ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY ... Route_1st median mean std max min Original_Departure_Time Original_ORIGIN_CITY Original_DEST_CITY Original_Time_Taken
0 HOU Houston, TX JFK New York, NY -11.0 -37.0 0.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaT NaN NaN NaN
1 HOU Houston, TX JFK New York, NY -8.0 27.0 0.0 NaN 0.0 0.0 ... Houston, TX -> Denver, CO -14.0 11.777778 30.568684 44.0 -14.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
2 HOU Houston, TX JFK New York, NY -8.0 27.0 0.0 NaN 0.0 0.0 ... Houston, TX -> New Orleans, LA -12.0 -4.076923 18.895339 33.0 -16.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
3 HOU Houston, TX JFK New York, NY -8.0 27.0 0.0 NaN 0.0 0.0 ... Houston, TX -> Charlotte, NC -11.0 2.076923 20.076776 33.0 -17.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
4 HOU Houston, TX JFK New York, NY -8.0 27.0 0.0 NaN 0.0 0.0 ... Houston, TX -> Raleigh/Durham, NC -2.0 10.444444 43.090344 114.0 -24.0 2014-07-01 18:12:00 Houston, TX New York, NY 27.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
46522 HOU Houston, TX JFK New York, NY 1.0 -7.0 0.0 NaN NaN NaN ... Houston, TX -> Miami, FL -10.0 -5.818182 32.771384 79.0 -36.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
46523 HOU Houston, TX JFK New York, NY 1.0 -7.0 0.0 NaN NaN NaN ... Houston, TX -> Denver, CO -9.0 -6.846154 14.006650 19.0 -23.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
46524 HOU Houston, TX JFK New York, NY 1.0 -7.0 0.0 NaN NaN NaN ... Houston, TX -> Santa Ana, CA -8.0 -8.000000 NaN -8.0 -8.0 2018-12-31 11:39:00 Houston, TX New York, NY -7.0
46525 JFK New York, NY SMF Sacramento, CA 0.0 -23.0 0.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaT NaN NaN NaN
46526 EWR Newark, NJ SMF Sacramento, CA 0.0 -30.0 0.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaT NaN NaN NaN

46527 rows × 27 columns

In [66]:
# Per-route summary: central tendency of the alternatives' median delay, plus
# the share of rows that actually had an alternative (non-null Route_1st).
out = merged_temp_df.groupby(['Route']).agg({
    "median" : ('median', 'mean'),
    # Clear equivalent of the original `sum(~x.isna()/len(x)*100)`:
    # percentage of non-null values.
    'Route_1st' : lambda x: x.notna().mean() * 100
}).reset_index()
In [68]:
# Replace the hierarchical column labels produced by the dict-style agg
# above with flat, presentation-ready names.
out.columns = ['Route', 'Alternative Arr Delay Median', 'Alternative Arr Delay Mean', 'Example Route Reliability (%)']
out
Out[68]:
Route Alternative Arr Delay Median Alternative Arr Delay Mean Example Route Reliability (%)
0 Houston, TX -> New York, NY -9.5 -7.283299 97.255575
1 Miami, FL -> Portland, OR -9.0 -8.605536 99.029126
2 New York, NY -> Sacramento, CA -5.0 -4.071500 94.410786
3 Newark, NJ -> Sacramento, CA -5.0 -3.935500 97.435492
4 San Juan, PR -> New Orleans, LA -5.0 -2.579310 99.315068
In [ ]: